R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. Html_fragmented is quicker to load than html, hence for this pipeline which genenerated a large mount of data, we have used html_fragment to report Rmarkdown.

Checklist for this pipeline

#1) The present analysis was done on MacOS, using knitToHtmlfragment.

To reuse this code for other datasets a) prepare mod_coded_metadata, mod, mod0 and pheno_sva variables relevant to the study of interst b) in this pipeline human gene symbols are used

SVA+LM or SVA+lmY or none and DEG_Limma normalization for study or experiment or batch correction and removal of gender

Step6: Load libraries, set working directory and Import data

#save working directory location
wd<-getwd()
wd
## [1] "/Users/powermac/Desktop/HuAgeDis_MSNDsplitArray(recoded)_7.71"
#source("https://bioconductor.org/biocLite.R")
#biocLite(c("sva","limma","bladderbatch","pamr"))
library(limma)
## Warning: package 'limma' was built under R version 3.5.1
library(sva)
## Warning: package 'sva' was built under R version 3.5.2
## Loading required package: mgcv
## Warning: package 'mgcv' was built under R version 3.5.2
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 3.5.2
## This is mgcv 1.8-28. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## Warning: package 'genefilter' was built under R version 3.5.1
## Loading required package: BiocParallel
## Warning: package 'BiocParallel' was built under R version 3.5.2
library(pamr)
## Warning: package 'pamr' was built under R version 3.5.2
## Loading required package: cluster
## Warning: package 'cluster' was built under R version 3.5.2
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.5.2
library(ggplot2) # plot
## Warning: package 'ggplot2' was built under R version 3.5.2
library(ggmap)#plots
## Warning: package 'ggmap' was built under R version 3.5.2
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(gplots)#plots
## Warning: package 'gplots' was built under R version 3.5.2
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(RColorBrewer)#color pallet
library(Hmisc)#for corelation plot
## Warning: package 'Hmisc' was built under R version 3.5.2
## Loading required package: lattice
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(viridis)#for corelation plot
## Loading required package: viridisLite
library(corrplot)#for corelation plot
## corrplot 0.84 loaded
library(filesstrings)#for file organization
## Loading required package: stringr
## Warning: package 'stringr' was built under R version 3.5.2
#Metadata
metadata=read.csv("Step5D_result_merge_GSE33000_to_GSE43490_metadata_SampleID.csv", header=T, sep=',')

#Expression data
Expr_GeneHu=read.csv("Step5D_result_merge_GSE33000_to_GSE43490_Expr_Gene_SampleID.csv", header =T, sep=',')
#Rename the gene symbol column from X to GeneHu
colnames(Expr_GeneHu)[colnames(Expr_GeneHu) == 'X'] <- 'GeneHu'
dim(metadata)
## [1] 784   9
dim(Expr_GeneHu)
## [1] 6924  785
#Need to make sure that the Gene IDs column labeled as GeneHu are unique to specify rownames with Gene IDs
Expr_GeneHu = Expr_GeneHu[!duplicated(Expr_GeneHu$GeneHu),]
dim(metadata)
## [1] 784   9
dim(Expr_GeneHu)
## [1] 6924  785
#We lost no Gene IDs as they are no duplicates in RNA-seq unline microarray
Expr_GeneHu_1=Expr_GeneHu[,-1] #get rid of extra column
rownames(Expr_GeneHu_1)=Expr_GeneHu$GeneHu 
Expr_GeneHu_1=Expr_GeneHu_1[complete.cases(Expr_GeneHu_1), ]
edata=Expr_GeneHu_1
#Optional filter low counts or else svobj step compalins about missing values and all zero rows
#Expr_GeneHu_2=Expr_GeneHu_1[rowSums(Expr_GeneHu_1)>0,]
#edata=Expr_GeneHu_2
#From here identify column numbers of interest for next step
colnames(metadata)
## [1] "X"              "Sample_Name"    "HuGroup_Name"   "Disease"       
## [5] "Age"            "Study"          "Gender"         "Tissue"        
## [9] "Disease_OwnCon"
metadata_1=metadata[,c("Disease","Age","Study","Gender","Tissue")]
#metadata_1=metadata[,4:8]
rownames(metadata_1)=metadata$Sample_Name
#View(metadata_1)
colnames(metadata_1)
## [1] "Disease" "Age"     "Study"   "Gender"  "Tissue"
#only keep the variables that will be considered in the mod, mod null and linear regression
#metadata_2=metadata_1[,1:5]
metadata_2=metadata_1[,c("Disease","Age","Study","Gender","Tissue")]
rownames(metadata_2)=rownames(metadata_1)
#Optional way to drop metadata pheno trait column. Instead of dropping metadata pheno trait column in this pipeline we only provide traits of interest in the model.matrix steps
#metadata_2=metadata_2[!names(metadata_2) %in% c("Tissue")]
pheno=metadata_2
#View(metadata_2)
#View(pheno)
colnames(pheno)
## [1] "Disease" "Age"     "Study"   "Gender"  "Tissue"
DT::datatable(pheno)
DT::datatable(edata[1:7,1:7])
dim(metadata)
## [1] 784   9
dim(pheno)
## [1] 784   5
dim(Expr_GeneHu) #dropped off 1 extra column
## [1] 6924  785
dim(edata)
## [1] 6924  784
#Make sure we did not lose any samples in expression data
#Here in the output the 1st two and the last two should be same i.e. the first and last samples are same
colnames(Expr_GeneHu[2])
## [1] "F_100_AD_GSM1423787"
colnames(edata[1])
## [1] "F_100_AD_GSM1423787"
colnames(Expr_GeneHu[22])
## [1] "F_41_HD_GSM1424344"
colnames(edata[21])
## [1] "F_41_HD_GSM1424344"

Step7: Preparing aggregate data expr or edata from metadata by variables

t_edata<-t(edata)
DT::datatable(t_edata[,1:7])
DT::datatable(pheno)
dim(t_edata)
## [1]  784 6924
dim(pheno)
## [1] 784   5
#bind the pheno and edata, this file can be used to predict trait based on a particular gene's gene expression
pheno_t_edata<-cbind(pheno, t_edata)
dim(pheno_t_edata)
## [1]  784 6929
DT::datatable(pheno_t_edata[,1:10])
write.csv(pheno_t_edata, "Step7_result_pheno_t_edata_All_Group_aggregate.csv")

Aggregade by Disease

#select the Disease pheno drop others
pheno_t_edata_Disease=pheno_t_edata[!names(pheno_t_edata) %in% c("Age", "Study", "Gender", "Tissue")]
dim(pheno_t_edata_Disease)
## [1]  784 6925
DT::datatable(pheno_t_edata_Disease[,1:10])
#aggregate edata or expression for the Disease pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.2
setDT(pheno_t_edata_Disease)
pheno_t_edata_Disease_avg=pheno_t_edata_Disease[, lapply(.SD, mean), by = .(Disease)]
DT::datatable(pheno_t_edata_Disease_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_Disease_avg=t(pheno_t_edata_Disease_avg[,-1])
colnames(t_pheno_t_edata_Disease_avg)=pheno_t_edata_Disease_avg$Disease
dim(t_pheno_t_edata_Disease_avg)
## [1] 6924    5
DT::datatable(t_pheno_t_edata_Disease_avg[1:10,])
write.csv(t_pheno_t_edata_Disease_avg, "Step7_result_edata_Disease_aggregate.csv")

Aggregade by Gender

#select the Gender pheno drop others
pheno_t_edata_Gender=pheno_t_edata[!names(pheno_t_edata) %in% c("Age", "Study", "Disease", "Tissue")]
dim(pheno_t_edata_Gender)
## [1]  784 6925
DT::datatable(pheno_t_edata_Gender[,1:10])
#aggregate edata or expression for the Gender pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_Gender)
pheno_t_edata_Gender_avg=pheno_t_edata_Gender[, lapply(.SD, mean), by = .(Gender)]
DT::datatable(pheno_t_edata_Gender_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_Gender_avg=t(pheno_t_edata_Gender_avg[,-1])
colnames(t_pheno_t_edata_Gender_avg)=pheno_t_edata_Gender_avg$Gender
dim(t_pheno_t_edata_Gender_avg)
## [1] 6924    2
DT::datatable(t_pheno_t_edata_Gender_avg[1:10,])
write.csv(t_pheno_t_edata_Gender_avg, "Step7_result_edata_Gender_aggregate.csv")

Aggregade by Age

#select the Age pheno drop others
pheno_t_edata_Age=pheno_t_edata[!names(pheno_t_edata) %in% c("Disease", "Study", "Gender", "Tissue")]
dim(pheno_t_edata_Age)
## [1]  784 6925
DT::datatable(pheno_t_edata_Age[,1:10])
#aggregate edata or expression for the Age pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_Age)
pheno_t_edata_Age_avg=pheno_t_edata_Age[, lapply(.SD, mean), by = .(Age)]
DT::datatable(pheno_t_edata_Age_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_Age_avg=t(pheno_t_edata_Age_avg[,-1])
colnames(t_pheno_t_edata_Age_avg)=pheno_t_edata_Age_avg$Age
dim(t_pheno_t_edata_Age_avg)
## [1] 6924   77
DT::datatable(t_pheno_t_edata_Age_avg[1:10,])
write.csv(t_pheno_t_edata_Age_avg, "Step7_result_edata_Age_aggregate.csv")

Aggregade by Tissue

#select the Tissue pheno drop others
pheno_t_edata_Tissue=pheno_t_edata[!names(pheno_t_edata) %in% c("Disease", "Study", "Gender", "Age")]
dim(pheno_t_edata_Tissue)
## [1]  784 6925
DT::datatable(pheno_t_edata_Tissue[,1:10])
#aggregate edata or expression for the Tissue pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_Tissue)
pheno_t_edata_Tissue_avg=pheno_t_edata_Tissue[, lapply(.SD, mean), by = .(Tissue)]
DT::datatable(pheno_t_edata_Tissue_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_Tissue_avg=t(pheno_t_edata_Tissue_avg[,-1])
colnames(t_pheno_t_edata_Tissue_avg)=pheno_t_edata_Tissue_avg$Tissue
dim(t_pheno_t_edata_Tissue_avg)
## [1] 6924    3
DT::datatable(t_pheno_t_edata_Tissue_avg[1:10,])
write.csv(t_pheno_t_edata_Tissue_avg, "Step7_result_edata_Tissue_aggregate.csv")

Aggregade by Study

#select the Study pheno drop others
pheno_t_edata_Study=pheno_t_edata[!names(pheno_t_edata) %in% c("Disease", "Tissue", "Gender", "Age")]
dim(pheno_t_edata_Study)
## [1]  784 6925
DT::datatable(pheno_t_edata_Study[,1:10])
#aggregate edata or expression for the Study pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_Study)
pheno_t_edata_Study_avg=pheno_t_edata_Study[, lapply(.SD, mean), by = .(Study)]
DT::datatable(pheno_t_edata_Study_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_Study_avg=t(pheno_t_edata_Study_avg[,-1])
colnames(t_pheno_t_edata_Study_avg)=pheno_t_edata_Study_avg$Study
dim(t_pheno_t_edata_Study_avg)
## [1] 6924    7
DT::datatable(t_pheno_t_edata_Study_avg[1:10,])
write.csv(t_pheno_t_edata_Study_avg, "Step7_result_edata_Study_aggregate.csv")

Aggregade all Groups

t_pheno_t_edata_all_avg<-cbind(t_pheno_t_edata_Disease_avg, t_pheno_t_edata_Gender_avg, t_pheno_t_edata_Age_avg, t_pheno_t_edata_Tissue_avg, t_pheno_t_edata_Study_avg)
DT::datatable(head(t_pheno_t_edata_all_avg))
write.csv(t_pheno_t_edata_all_avg, "Step7_result_edata_All_Group_aggregate.csv")
#save pheno_t_edata, t_pheno_t_edata_Disease_avg, t_pheno_t_edata_Gender_avg, t_pheno_t_edata_Age_avg, t_pheno_t_edata_Tissue_avg, t_pheno_t_edata_Study_avg, t_pheno_t_edata_all_avg
save(pheno_t_edata, t_pheno_t_edata_Disease_avg, t_pheno_t_edata_Gender_avg, t_pheno_t_edata_Age_avg, t_pheno_t_edata_Tissue_avg, t_pheno_t_edata_Study_avg, t_pheno_t_edata_all_avg, file="edata_aggregate_data.RData")

Step7: Visualization by heatmap and correlation of aggregate data expr or edata from metadata by variables

pdf(file="Step7_plot_boxplot_corr_plot_edata_aka_expr_aggregate.pdf",height=10,width=10)
par(mfrow=c(2,1))

                    #########boxplot Disease Gender Age Tissue Study and all ########
#box plot on aggregate Disease
boxplot(t_pheno_t_edata_Disease_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_Disease_avg", col="yellow")

#box plot on aggregate Gender
boxplot(t_pheno_t_edata_Gender_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_Gender_avg", col="yellow")

#box plot on aggregate Age
boxplot(t_pheno_t_edata_Age_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_Age_avg", col="yellow")

#box plot on aggregate Tissue
boxplot(t_pheno_t_edata_Tissue_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_Tissue_avg", col="yellow")

#box plot on aggregate Study
boxplot(t_pheno_t_edata_Study_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_Study_avg", col="yellow")

#box plot on aggregate all
boxplot(t_pheno_t_edata_all_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_all_avg", col="yellow")

                    #########corr plot Disease Gender Age Tissue Study and all ########
#create cor matrix on aggregate Disease
rel2 <- rcorr(as.matrix(t_pheno_t_edata_Disease_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_Disease_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Gender
rel2 <- rcorr(as.matrix(t_pheno_t_edata_Gender_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_Gender_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Age
rel2 <- rcorr(as.matrix(t_pheno_t_edata_Age_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_Age_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Tissue
rel2 <- rcorr(as.matrix(t_pheno_t_edata_Tissue_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_Tissue_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Study
rel2 <- rcorr(as.matrix(t_pheno_t_edata_Study_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_Study_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate all
rel2 <- rcorr(as.matrix(t_pheno_t_edata_all_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_all_Group_corr_pval_aggregate.csv")

dev.off()
## quartz_off_screen 
##                 2
pdf(file="Step7_plot_density_edata_aka_expr_aggregate.pdf",height=10,width=10)

                    #########density plot Disease Gender Age Tissue Study and all ########
#density plot on aggregate Disease
df=as.data.frame(t_pheno_t_edata_Disease_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_Disease_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Disease",x="t_pheno_t_edata_Disease_avg", y = "Density")


#density plot on aggregate Gender
df=as.data.frame(t_pheno_t_edata_Gender_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_Gender_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Gender",x="t_pheno_t_edata_Gender_avg", y = "Density")
  

#density plot on aggregate Age
df=as.data.frame(t_pheno_t_edata_Age_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_Age_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Age",x="t_pheno_t_edata_Age_avg", y = "Density")

#density plot on aggregate Tissue
df=as.data.frame(t_pheno_t_edata_Tissue_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_Tissue_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Tissue",x="t_pheno_t_edata_Tissue_avg", y = "Density")

#density plot on aggregate Study
df=as.data.frame(t_pheno_t_edata_Study_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_Study_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Study",x="t_pheno_t_edata_Study_avg", y = "Density")
  
#density plot on aggregate all
df=as.data.frame(t_pheno_t_edata_all_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_all_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for all",x="t_pheno_t_edata_all_avg", y = "Density") 

dev.off()
## quartz_off_screen 
##                 2
save.image(file="done_pre_Step8.RData")

Step8: Preparing models for SVA (Surrogate variable adjustment)

#From here identify column numbers of interest for next step
colnames(metadata_1)
## [1] "Disease" "Age"     "Study"   "Gender"  "Tissue"
#create coded metadata file
#For multilevel variables, R automatically can create multiple dummy variables here from the columns containing character metadata
#http://jtleek.com/genstats/inst/doc/02_09_linear-regression.html#categorical-variables-with-multiple-levels
#https://bioconductor.org/packages/release/bioc/vignettes/sva/inst/doc/sva.pdf
#we will save this mod and use it as coded metadata file for WGCNA.Using 0 intercept splits Disease coded into 3 columns 

mod_coded_metadata = model.matrix(~0+Disease+Study+Gender+Tissue+Age, data=metadata_1)

write.csv(mod_coded_metadata, "Step8_result_merge_GSE33000_to_GSE43490_metadata_SampleID_coded.csv")
#From here identify column numbers of interest for mod and mod0  
colnames(pheno)
## [1] "Disease" "Age"     "Study"   "Gender"  "Tissue"
#full model matrix, variable of interest and other variables
#Put categorical variables ahead of numeric variables 
#Put variable that will be used in DEG_Limma for contrast first
#for combat requires variables in mod to be numeric

mod = model.matrix(~Study+Gender+Tissue+Disease+Age, data=pheno) 

DT::datatable(mod)
#null model matrix all except the variable of interest 
#Put categorical variables ahead of numeric variables if using model below

mod0 = model.matrix(~Study+Gender+Tissue, data=pheno) #did not use ~0+ as then Study will get split into two columns unlike Study in mod above where it is just one column

DT::datatable(mod0)

Step9: Doing SVA (Surrogate variable adjustment)

#For SVA

n.sv = num.sv(as.matrix(edata),mod,method="be") 
svobj = sva(as.matrix(edata),mod,mod0,n.sv=n.sv, B=20)
## Number of significant surrogate variables is:  96 
## Iteration (out of 20 ):1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20
#But if sva gives the following errors then use combat as shown in the next code chunk
#1) At least one covariate is confounded with batch! Please remove confounded covariates and reGEO_Accession ComBat
#2) Error in solve.default(t(mod) %*% mod) : Lapack routine dgesv: system is exactly singular: U[10,10] = 0

sva function returns a list of 4 components: 1) sv= matrix with columns corrosponding to estimate of surrogate variables 2) pprob.gam=probability that each gene is associated with one or more latent variables 3) pprob.b=probability that each gene is associated with our variable of interest 4) n.sv=number of surrogate variables

#if GEO_Accessionning SVA gave error as described above or even as an alternative to SVAlm can use ComBat which is part of the SVA package 

#edata_combatY = ComBat(dat=as.matrix(edata), batch=pheno$Study, mod=mod, par.prior=TRUE, prior.plots=FALSE, mean.only=TRUE)

#edata_combatY = ComBat(dat=as.matrix(edata), batch=pheno$Study, mod=mod, par.prior=TRUE, prior.plots=FALSE)

Step10: Adjust edata based on SVA (using LM or cleanY)

For WGCNA alternative 1: use edata or cleanY or lmY expression file

For WGCNA alternative 2: use edata or cleanY or lmY expression file retaining only DEG_Limma for WGCNA.

ComBat applies the adjustment directly to the edata so lmY steps are not required

colnames(pheno)
## [1] "Disease" "Age"     "Study"   "Gender"  "Tissue"
#LM step 1 of 2
#For multilevel variables, R automatically can create multiple dummy variables here from the columns containing character metadata
#http://jtleek.com/genstats/inst/doc/02_09_linear-regression.html#categorical-variables-with-multiple-levels
#Note here the column which we care about in the mod are NOT used pheno[c(-?)]
#pheno_sva=cbind(pheno[-c(pheno$Disease, pheno$Age)],svobj$sv) #this does not work for the pheno vector type
#pheno_sva=cbind(pheno[-c(1,2)],svobj$sv)

pheno_sva=cbind(pheno[-c(1)],svobj$sv) #keep Disease only

reglm_sva<-lapply(1:nrow(edata), function(x){lm(unlist(edata[x,])~.,data=pheno_sva)})
DT::datatable(pheno_sva)
#LM step 2 of 2
#after fitting with variables other than variable of interest the residula that is left over is essentially the effect of the variable of interest

residuals<-lapply(reglm_sva, function(x)residuals(summary(x)))
residuals<-do.call(rbind, residuals)
edata_lmY<-residuals+matrix(apply(edata, 1, mean), nrow=nrow(residuals), ncol=ncol(residuals))
rownames(edata_lmY)=rownames(edata)
rownames(residuals)=rownames(edata)
DT::datatable(edata_lmY[1:7,1:7])
dim(edata_lmY)
## [1] 6924  784
write.csv(edata_lmY, "Step10_result_edata_lmY.csv")

Clear up workspace and retain required dataframes

#Save .RData and clear environment to free up memory
save.image(file="temp.RData")
rm(list=ls())
gc()
##           used  (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells 3852217 205.8   12725224   679.7         NA   12725224   679.7
## Vcells 8456330  64.6 1411925488 10772.2     102400 1355694339 10343.2
#This 'reglm_sva' is a big object so we remove it as we have saved the other variables we need 
load(file="temp.RData")
rm(reglm_sva)
gc() 
##            used  (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells  3933233 210.1   20910045  1116.8         NA   12899274   688.9
## Vcells 77721223 593.0 1355512468 10341.8     102400 1355694339 10343.2

Step10: Visualization of before and after SVA lmY

Before SVA: edata After SVA and lmY: edata_lmY for lmY adjusted edata

#Summary Statistics edata
DT::datatable(summary(edata[1:7,1:7]))
write.csv(summary(edata),"Step10_result_summary_stats_edata.csv")

#Summary Statistics edata_lmY
DT::datatable(summary(edata_lmY[1:7,1:7]))
write.csv(summary(edata_lmY),"Step10_result_summary_stats_edata_lmY.csv")
pdf(file="Step10_plot_Visualization_boxplot_MDS_PC_edata_edata_lmY_edata_lmY.pdf",height=10,width=10)
par(mfrow=c(2,1))

                            ############set colors for chunk Study###############
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]

                            ############boxplot chunk###############
#randomly select 16 samples to visualize change in exprssion data with normalization. Trying to visualiza all sample is too many and hard to see
col_sel=sample(ncol(edata), 16) #use number less than 16 is total samples is less than 16

#Before boxplot edata
par(mai=c(1,0.8,1,0.8))
boxplot(edata[,col_sel], outline=FALSE, las=2, cex=0.25, main="edata", col="yellow")

#After boxplot edata_lmY
par(mai=c(1,0.8,1,0.8))
boxplot(edata_lmY[,col_sel], outline=FALSE, las=2, cex=0.25, main="edata_lmY", col="yellow")



                            ############MDSplot chunk###############
#Before MDSplot edata
par(mai=c(1,0.8,1,0.8))
plotMDS(edata[,col_sel], col=condition_colors, legend= "all", main="edata", cex=0.5)#like PCA plot
## Warning in plot.window(...): "legend" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "legend" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in box(...): "legend" is not a graphical parameter
## Warning in title(...): "legend" is not a graphical parameter
## Warning in text.default(x$x, x$y, labels = labels, cex = cex, ...):
## "legend" is not a graphical parameter
#After MDSplot edata_lmY
par(mai=c(1,0.8,1,0.8))
plotMDS(edata_lmY[,col_sel], col=condition_colors, legend= "all", main="edata_lmY", cex=0.5)#like PCA plot
## Warning in plot.window(...): "legend" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "legend" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in box(...): "legend" is not a graphical parameter
## Warning in title(...): "legend" is not a graphical parameter
## Warning in text.default(x$x, x$y, labels = labels, cex = cex, ...):
## "legend" is not a graphical parameter
                         #######PCAplot chunk Disease colored by Study or batch########
#Before PCAplot edata, colored by Study or batch, labels Disease
pca0=prcomp(t(edata), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                           PC1         PC2        PC3
## F_100_AD_GSM1423787 -33.05874 -0.57609949  0.4348591
## F_104_CON_GSM663093 197.83254 34.44417247 -0.5855307
## F_18_HD_GSM1424374  -32.57047 -0.04571757 -1.2724809
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Disease,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE33000_to_GSE43490"),
       text.col=c("red", "blue", "green")
)

#After PCAplot edata_lmY, colored by Study or batch, labels Disease
pca0=prcomp(t(edata_lmY), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                            PC1         PC2       PC3
## F_100_AD_GSM1423787  -3.460897  8.87646559 -2.814455
## F_104_CON_GSM663093 -23.363512 11.80882026 16.596438
## F_18_HD_GSM1424374    2.635769 -0.07995706  4.467857
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_lmY",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Disease,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE33000_to_GSE43490"),
       text.col=c("red", "blue", "green")
)




                             #########PCAplot chunk Gender colored by Study or batch######
#Before PCAplot edata, colored by Study or batch, labels Gender
pca0=prcomp(t(edata), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                           PC1         PC2        PC3
## F_100_AD_GSM1423787 -33.05874 -0.57609949  0.4348591
## F_104_CON_GSM663093 197.83254 34.44417247 -0.5855307
## F_18_HD_GSM1424374  -32.57047 -0.04571757 -1.2724809
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Gender,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE33000_to_GSE43490"),
       text.col=c("red", "blue", "green")
)

#After PCAplot edata_lmY, colored by Study or batch, labels Gender
pca0=prcomp(t(edata_lmY), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                            PC1         PC2       PC3
## F_100_AD_GSM1423787  -3.460897  8.87646559 -2.814455
## F_104_CON_GSM663093 -23.363512 11.80882026 16.596438
## F_18_HD_GSM1424374    2.635769 -0.07995706  4.467857
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_lmY",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Gender,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE33000_to_GSE43490"),
       text.col=c("red", "blue", "green")
)



                                 #########PCAplot chunk Age colored by Study or batch########
#Before PCAplot edata, colored by Study or batch, labels Age
pca0=prcomp(t(edata), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                           PC1         PC2        PC3
## F_100_AD_GSM1423787 -33.05874 -0.57609949  0.4348591
## F_104_CON_GSM663093 197.83254 34.44417247 -0.5855307
## F_18_HD_GSM1424374  -32.57047 -0.04571757 -1.2724809
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Age,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE33000_to_GSE43490"),
       text.col=c("red", "blue", "green")
)

#After PCAplot edata_lmY, colored by Study or batch, labels Age
pca0=prcomp(t(edata_lmY), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                            PC1         PC2       PC3
## F_100_AD_GSM1423787  -3.460897  8.87646559 -2.814455
## F_104_CON_GSM663093 -23.363512 11.80882026 16.596438
## F_18_HD_GSM1424374    2.635769 -0.07995706  4.467857
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_lmY",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Age,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE33000_to_GSE43490"),
       text.col=c("red", "blue", "green")
)


                                #########PCAplot chunk Tissue colored by Study or batch########
#Before PCAplot edata, colored by Study or batch, labels Tissue
pca0=prcomp(t(edata), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                           PC1         PC2        PC3
## F_100_AD_GSM1423787 -33.05874 -0.57609949  0.4348591
## F_104_CON_GSM663093 197.83254 34.44417247 -0.5855307
## F_18_HD_GSM1424374  -32.57047 -0.04571757 -1.2724809
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Tissue,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE33000_to_GSE43490"),
       text.col=c("red", "blue", "green")
)

#After PCAplot edata_lmY, colored by Study or batch, labels Tissue
pca0=prcomp(t(edata_lmY), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                            PC1         PC2       PC3
## F_100_AD_GSM1423787  -3.460897  8.87646559 -2.814455
## F_104_CON_GSM663093 -23.363512 11.80882026 16.596438
## F_18_HD_GSM1424374    2.635769 -0.07995706  4.467857
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_lmY",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Tissue,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE33000_to_GSE43490"),
       text.col=c("red", "blue", "green")
)

dev.off()
## quartz_off_screen 
##                 2

Step11: Preparing aggregate data edata_lmY from metadata by variables

t_edata_lmY<-t(edata_lmY)
DT::datatable(t_edata_lmY[,1:7])
DT::datatable(pheno)
dim(t_edata_lmY)
## [1]  784 6924
dim(pheno)
## [1] 784   5
#bind the pheno and edata_lmY, this file can be used to predict trait based on a particular gene's gene expression
pheno_t_edata_lmY<-cbind(pheno, t_edata_lmY)
dim(pheno_t_edata_lmY)
## [1]  784 6929
DT::datatable(pheno_t_edata_lmY[,1:10])
write.csv(pheno_t_edata_lmY, "Step11_result_pheno_t_edata_lmY_All_Group_aggregate.csv")

Aggregade by Disease

#select the Disease pheno drop others
pheno_t_edata_lmY_Disease=pheno_t_edata_lmY[!names(pheno_t_edata_lmY) %in% c("Age", "Study", "Gender", "Tissue")]
dim(pheno_t_edata_lmY_Disease)
## [1]  784 6925
DT::datatable(pheno_t_edata_lmY_Disease[,1:10])
#aggregate edata_lmY or expression for the Disease pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_lmY_Disease)
pheno_t_edata_lmY_Disease_avg=pheno_t_edata_lmY_Disease[, lapply(.SD, mean), by = .(Disease)]
DT::datatable(pheno_t_edata_lmY_Disease_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_lmY_Disease_avg=t(pheno_t_edata_lmY_Disease_avg[,-1])
colnames(t_pheno_t_edata_lmY_Disease_avg)=pheno_t_edata_lmY_Disease_avg$Disease
dim(t_pheno_t_edata_lmY_Disease_avg)
## [1] 6924    5
DT::datatable(t_pheno_t_edata_lmY_Disease_avg[1:10,])
write.csv(t_pheno_t_edata_lmY_Disease_avg, "Step11_result_edata_lmY_Disease_aggregate.csv")

Aggregade by Gender

#select the Gender pheno drop others
pheno_t_edata_lmY_Gender=pheno_t_edata_lmY[!names(pheno_t_edata_lmY) %in% c("Age", "Study", "Disease", "Tissue")]
dim(pheno_t_edata_lmY_Gender)
## [1]  784 6925
DT::datatable(pheno_t_edata_lmY_Gender[,1:10])
#aggregate edata_lmY or expression for the Gender pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_lmY_Gender)
pheno_t_edata_lmY_Gender_avg=pheno_t_edata_lmY_Gender[, lapply(.SD, mean), by = .(Gender)]
DT::datatable(pheno_t_edata_lmY_Gender_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_lmY_Gender_avg=t(pheno_t_edata_lmY_Gender_avg[,-1])
colnames(t_pheno_t_edata_lmY_Gender_avg)=pheno_t_edata_lmY_Gender_avg$Gender
dim(t_pheno_t_edata_lmY_Gender_avg)
## [1] 6924    2
DT::datatable(t_pheno_t_edata_lmY_Gender_avg[1:10,])
write.csv(t_pheno_t_edata_lmY_Gender_avg, "Step11_result_edata_lmY_Gender_aggregate.csv")

Aggregade by Age

#select the Age pheno drop others
pheno_t_edata_lmY_Age=pheno_t_edata_lmY[!names(pheno_t_edata_lmY) %in% c("Disease", "Study", "Gender", "Tissue")]
dim(pheno_t_edata_lmY_Age)
## [1]  784 6925
DT::datatable(pheno_t_edata_lmY_Age[,1:10])
#aggregate edata_lmY or expression for the Age pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_lmY_Age)
pheno_t_edata_lmY_Age_avg=pheno_t_edata_lmY_Age[, lapply(.SD, mean), by = .(Age)]
DT::datatable(pheno_t_edata_lmY_Age_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_lmY_Age_avg=t(pheno_t_edata_lmY_Age_avg[,-1])
colnames(t_pheno_t_edata_lmY_Age_avg)=pheno_t_edata_lmY_Age_avg$Age
dim(t_pheno_t_edata_lmY_Age_avg)
## [1] 6924   77
DT::datatable(t_pheno_t_edata_lmY_Age_avg[1:10,])
write.csv(t_pheno_t_edata_lmY_Age_avg, "Step11_result_edata_lmY_Age_aggregate.csv")

Aggregade by Tissue

#select the Tissue pheno drop others
pheno_t_edata_lmY_Tissue=pheno_t_edata_lmY[!names(pheno_t_edata_lmY) %in% c("Disease", "Study", "Gender", "Age")]
dim(pheno_t_edata_lmY_Tissue)
## [1]  784 6925
DT::datatable(pheno_t_edata_lmY_Tissue[,1:10])
#aggregate edata_lmY or expression for the Tissue pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_lmY_Tissue)
pheno_t_edata_lmY_Tissue_avg=pheno_t_edata_lmY_Tissue[, lapply(.SD, mean), by = .(Tissue)]
DT::datatable(pheno_t_edata_lmY_Tissue_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_lmY_Tissue_avg=t(pheno_t_edata_lmY_Tissue_avg[,-1])
colnames(t_pheno_t_edata_lmY_Tissue_avg)=pheno_t_edata_lmY_Tissue_avg$Tissue
dim(t_pheno_t_edata_lmY_Tissue_avg)
## [1] 6924    3
DT::datatable(t_pheno_t_edata_lmY_Tissue_avg[1:10,])
write.csv(t_pheno_t_edata_lmY_Tissue_avg, "Step11_result_edata_lmY_Tissue_aggregate.csv")

Aggregade by Study

#select the Study pheno drop others
pheno_t_edata_lmY_Study=pheno_t_edata_lmY[!names(pheno_t_edata_lmY) %in% c("Disease", "Tissue", "Gender", "Age")]
dim(pheno_t_edata_lmY_Study)
## [1]  784 6925
DT::datatable(pheno_t_edata_lmY_Study[,1:10])
#aggregate edata_lmY or expression for the Study pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_lmY_Study)
pheno_t_edata_lmY_Study_avg=pheno_t_edata_lmY_Study[, lapply(.SD, mean), by = .(Study)]
DT::datatable(pheno_t_edata_lmY_Study_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_lmY_Study_avg=t(pheno_t_edata_lmY_Study_avg[,-1])
colnames(t_pheno_t_edata_lmY_Study_avg)=pheno_t_edata_lmY_Study_avg$Study
dim(t_pheno_t_edata_lmY_Study_avg)
## [1] 6924    7
DT::datatable(t_pheno_t_edata_lmY_Study_avg[1:10,])
write.csv(t_pheno_t_edata_lmY_Study_avg, "Step11_result_edata_lmY_Study_aggregate.csv")

Aggregade all Groups

t_pheno_t_edata_lmY_all_avg<-cbind(t_pheno_t_edata_lmY_Disease_avg, t_pheno_t_edata_lmY_Gender_avg, t_pheno_t_edata_lmY_Age_avg, t_pheno_t_edata_lmY_Tissue_avg)
DT::datatable(head(t_pheno_t_edata_lmY_all_avg))
write.csv(t_pheno_t_edata_lmY_all_avg, "Step11_result_edata_lmY_All_Group_aggregate.csv")
#save pheno_t_edata_lmY, t_pheno_t_edata_lmY_Disease_avg, t_pheno_t_edata_lmY_Gender_avg, t_pheno_t_edata_lmY_Age_avg, t_pheno_t_edata_lmY_Tissue_avg, t_pheno_t_edata_lmY_Study_avg, t_pheno_t_edata_lmY_all_avg
save(pheno_t_edata_lmY, t_pheno_t_edata_lmY_Disease_avg, t_pheno_t_edata_lmY_Gender_avg, t_pheno_t_edata_lmY_Age_avg, t_pheno_t_edata_lmY_Tissue_avg, t_pheno_t_edata_lmY_Study_avg, t_pheno_t_edata_lmY_all_avg, file="edata_lmY_aggregate_data.RData")

Step11: Visualization by heatmap and correlation of aggregate data logtpm or edata_lmY from metadata by variables

pdf(file="Step11_plot_boxplot_corr_plot_edata_lmY_aka_expr_aggregate.pdf",height=10,width=10)
par(mfrow=c(2,1))

                    #########boxplot Disease Gender Age Tissue Study and all ########
#box plot on aggregate Disease
boxplot(t_pheno_t_edata_lmY_Disease_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_lmY_Disease_avg", col="yellow")

#box plot on aggregate Gender
boxplot(t_pheno_t_edata_lmY_Gender_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_lmY_Gender_avg", col="yellow")

#box plot on aggregate Age
boxplot(t_pheno_t_edata_lmY_Age_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_lmY_Age_avg", col="yellow")

#box plot on aggregate Tissue
boxplot(t_pheno_t_edata_lmY_Tissue_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_lmY_Tissue_avg", col="yellow")

#box plot on aggregate Study
boxplot(t_pheno_t_edata_lmY_Study_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_lmY_Study_avg", col="yellow")

#box plot on aggregate all
boxplot(t_pheno_t_edata_lmY_all_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_lmY_all_avg", col="yellow")

                    #########corr plot Disease Gender Age and all########
#create cor matrix on aggregate Disease
rel2 <- rcorr(as.matrix(t_pheno_t_edata_lmY_Disease_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step11_result_edata_lmY_Disease_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Gender
rel2 <- rcorr(as.matrix(t_pheno_t_edata_lmY_Gender_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step11_result_edata_lmY_Gender_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Age
rel2 <- rcorr(as.matrix(t_pheno_t_edata_lmY_Age_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step11_result_edata_lmY_Age_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Tissue
rel2 <- rcorr(as.matrix(t_pheno_t_edata_lmY_Tissue_avg))
## Warning in sqrt(1 - h * h): NaNs produced
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step11_result_edata_lmY_Tissue_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Study
rel2 <- rcorr(as.matrix(t_pheno_t_edata_lmY_Study_avg))
## Warning in sqrt(1 - h * h): NaNs produced
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step11_result_edata_lmY_Study_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate all
rel2 <- rcorr(as.matrix(t_pheno_t_edata_lmY_all_avg))
## Warning in sqrt(1 - h * h): NaNs produced
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step11_result_edata_lmY_all_Group_corr_pval_aggregate.csv")

dev.off()
## quartz_off_screen 
##                 2
pdf(file="Step11_plot_density_edata_lmY_aka_expr_aggregate.pdf",height=10,width=10)

                    #########density plot Disease Gender Age Tissue Study and all ########
#density plot on aggregate Disease
df=as.data.frame(t_pheno_t_edata_lmY_Disease_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_lmY_Disease_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Disease",x="t_pheno_t_edata_lmY_Disease_avg", y = "Density")


#density plot on aggregate Gender
df=as.data.frame(t_pheno_t_edata_lmY_Gender_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_lmY_Gender_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Gender",x="t_pheno_t_edata_lmY_Gender_avg", y = "Density")
  

#density plot on aggregate Age
df=as.data.frame(t_pheno_t_edata_lmY_Age_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_lmY_Age_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Age",x="t_pheno_t_edata_lmY_Age_avg", y = "Density")
  

#density plot on aggregate Tissue
df=as.data.frame(t_pheno_t_edata_lmY_Tissue_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_lmY_Tissue_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Tissue",x="t_pheno_t_edata_lmY_Tissue_avg", y = "Density")

  
#density plot on aggregate Study
df=as.data.frame(t_pheno_t_edata_lmY_Study_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_lmY_Study_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Study",x="t_pheno_t_edata_lmY_Study_avg", y = "Density")


#density plot on aggregate all
df=as.data.frame(t_pheno_t_edata_lmY_all_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_lmY_all_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for all",x="t_pheno_t_edata_lmY_all_avg", y = "Density") 

dev.off()
## quartz_off_screen 
##                 2

Clear up workspace and retain required dataframes

#save pheno, edata and edata_lmY required for steps below
save(pheno, metadata_1, edata, edata_lmY, file="edata_lmY_data.RData")
#save image
save.image(file="SVA_lmY_temp.RData")

See DEG pipeline, uses edata and modified metadata

Step12: Differentially expressed genes DEG_Limma limma method

See DEG pipelines

Step12: Visualization of differential gene expression results from DEG_Limma

See DEG pipelines

Step13: Keep only DEG_Limma genes from the edata, edata_lmY for combat

See DEG pipelines

Step13: Visualization of before and after SVA with/without DEG_Limma filter

See DEG pipelines

Step14: Differentially expressed genes DEG simple division of arithmetic means of expression data so easier to interpret like cuffdiff (published CGGA code)

See DEG pipelines

Step14: Visualization of differential gene expression results from DEG_Simple

See DEG pipelines

Step15: Keep only DEG_Simple genes from the edata and edata_lmY for combat

See DEG pipelines

Step15: Visualization of before and after SVA with/without DEG_Simple filter

See DEG pipelines

Step16: Differentially expressed genes DEG_edgeR method

See DEG pipelines

Step16: Visualization of differential gene expression results from DEG_edgeR

See DEG pipelines

Step17: Keep only DEG_edgeR genes from the edata and edata_lmY for combat

See DEG pipelines

Step17: Visualization of before and after SVA with/without DEG_edgeR filter

See DEG pipelines

Step18: Differentially expressed genes DEG_Cuff method

See DEG pipelines

Step18: Visualization of differential gene expression results from DEG_Cuff

See DEG pipelines

Step19: Keep only DEG_Cuff genes from the edata and edata_lmY for combat

See DEG pipelines

Step19: Visualization of before and after SVA with/without DEG_Cuff filter

See DEG pipelines

Step20: Comparison of overlap between DEG_Limma DEG_Simple DEG_edgeR and (if available) DEG_cuff

See DEG pipelines

Step20: Visualization of overlap between DEG_Limma DEG_Simple and DEG_edgeR

See DEG pipelines

Step21: Keep only DEG_edgeRLimmaSimple genes from the edata and edata_lmY or combat

See DEG pipelines

Step21: Visualization of before and after SVA with/without DEG_AllDEGmethods filter

See DEG pipelines

End of DEG pipeline, uses edata and modified metadata

Step12:Organization and saving session (software version) information

sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] data.table_1.12.2   filesstrings_3.0.0  stringr_1.4.0      
##  [4] corrplot_0.84       viridis_0.5.1       viridisLite_0.3.0  
##  [7] Hmisc_4.2-0         Formula_1.2-3       lattice_0.20-38    
## [10] RColorBrewer_1.1-2  gplots_3.0.1.1      ggmap_3.0.0        
## [13] ggplot2_3.1.1       pamr_1.56.1         survival_2.44-1.1  
## [16] cluster_2.0.8       sva_3.30.1          BiocParallel_1.16.6
## [19] genefilter_1.64.0   mgcv_1.8-28         nlme_3.1-139       
## [22] limma_3.38.3       
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6         matrixStats_0.54.0   bit64_0.9-7         
##  [4] httr_1.4.0           tools_3.5.0          backports_1.1.4     
##  [7] DT_0.5               R6_2.4.0             rpart_4.1-15        
## [10] KernSmooth_2.23-15   DBI_1.0.0            lazyeval_0.2.2      
## [13] BiocGenerics_0.28.0  colorspace_1.4-1     nnet_7.3-12         
## [16] withr_2.1.2          tidyselect_0.2.5     gridExtra_2.3       
## [19] bit_1.1-14           compiler_3.5.0       Biobase_2.42.0      
## [22] htmlTable_1.13.1     labeling_0.3         caTools_1.17.1.2    
## [25] scales_1.0.0         checkmate_1.9.1      digest_0.6.18       
## [28] foreign_0.8-71       rmarkdown_1.12       base64enc_0.1-3     
## [31] jpeg_0.1-8           pkgconfig_2.0.2      htmltools_0.3.6     
## [34] strex_0.1.3          htmlwidgets_1.3      rlang_0.3.4         
## [37] rstudioapi_0.10      RSQLite_2.1.1        shiny_1.3.2         
## [40] jsonlite_1.6         crosstalk_1.0.0      gtools_3.8.1        
## [43] acepack_1.4.1        dplyr_0.8.0.1        RCurl_1.95-4.12     
## [46] magrittr_1.5         Matrix_1.2-17        Rcpp_1.0.1          
## [49] munsell_0.5.0        S4Vectors_0.20.1     stringi_1.4.3       
## [52] yaml_2.2.0           plyr_1.8.4           grid_3.5.0          
## [55] blob_1.1.1           promises_1.0.1       parallel_3.5.0      
## [58] gdata_2.18.0         crayon_1.3.4         splines_3.5.0       
## [61] annotate_1.60.1      knitr_1.22           pillar_1.3.1        
## [64] rjson_0.2.20         reshape2_1.4.3       stats4_3.5.0        
## [67] XML_3.98-1.19        glue_1.3.1           evaluate_0.13       
## [70] latticeExtra_0.6-28  httpuv_1.5.1         png_0.1-7           
## [73] RgoogleMaps_1.4.3    gtable_0.3.0         purrr_0.3.2         
## [76] tidyr_0.8.3          assertthat_0.2.1     xfun_0.6            
## [79] mime_0.6             xtable_1.8-4         later_0.8.0         
## [82] tibble_2.1.1         AnnotationDbi_1.44.0 memoise_1.1.0       
## [85] IRanges_2.16.0
toLatex(sessionInfo())
## \begin{itemize}\raggedright
##   \item R version 3.5.0 (2018-04-23), \verb|x86_64-apple-darwin15.6.0|
##   \item Locale: \verb|en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8|
##   \item Running under: \verb|macOS High Sierra 10.13.6|
##   \item Matrix products: default
##   \item BLAS: \verb|/Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib|
##   \item LAPACK: \verb|/Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib|
##   \item Base packages: base, datasets, graphics, grDevices,
##     methods, stats, utils
##   \item Other packages: BiocParallel~1.16.6, cluster~2.0.8,
##     corrplot~0.84, data.table~1.12.2, filesstrings~3.0.0,
##     Formula~1.2-3, genefilter~1.64.0, ggmap~3.0.0, ggplot2~3.1.1,
##     gplots~3.0.1.1, Hmisc~4.2-0, lattice~0.20-38, limma~3.38.3,
##     mgcv~1.8-28, nlme~3.1-139, pamr~1.56.1, RColorBrewer~1.1-2,
##     stringr~1.4.0, survival~2.44-1.1, sva~3.30.1, viridis~0.5.1,
##     viridisLite~0.3.0
##   \item Loaded via a namespace (and not attached): acepack~1.4.1,
##     annotate~1.60.1, AnnotationDbi~1.44.0, assertthat~0.2.1,
##     backports~1.1.4, base64enc~0.1-3, Biobase~2.42.0,
##     BiocGenerics~0.28.0, bit~1.1-14, bit64~0.9-7, bitops~1.0-6,
##     blob~1.1.1, caTools~1.17.1.2, checkmate~1.9.1,
##     colorspace~1.4-1, compiler~3.5.0, crayon~1.3.4,
##     crosstalk~1.0.0, DBI~1.0.0, digest~0.6.18, dplyr~0.8.0.1,
##     DT~0.5, evaluate~0.13, foreign~0.8-71, gdata~2.18.0,
##     glue~1.3.1, grid~3.5.0, gridExtra~2.3, gtable~0.3.0,
##     gtools~3.8.1, htmlTable~1.13.1, htmltools~0.3.6,
##     htmlwidgets~1.3, httpuv~1.5.1, httr~1.4.0, IRanges~2.16.0,
##     jpeg~0.1-8, jsonlite~1.6, KernSmooth~2.23-15, knitr~1.22,
##     labeling~0.3, later~0.8.0, latticeExtra~0.6-28,
##     lazyeval~0.2.2, magrittr~1.5, Matrix~1.2-17,
##     matrixStats~0.54.0, memoise~1.1.0, mime~0.6, munsell~0.5.0,
##     nnet~7.3-12, parallel~3.5.0, pillar~1.3.1, pkgconfig~2.0.2,
##     plyr~1.8.4, png~0.1-7, promises~1.0.1, purrr~0.3.2, R6~2.4.0,
##     Rcpp~1.0.1, RCurl~1.95-4.12, reshape2~1.4.3,
##     RgoogleMaps~1.4.3, rjson~0.2.20, rlang~0.3.4, rmarkdown~1.12,
##     rpart~4.1-15, RSQLite~2.1.1, rstudioapi~0.10,
##     S4Vectors~0.20.1, scales~1.0.0, shiny~1.3.2, splines~3.5.0,
##     stats4~3.5.0, strex~0.1.3, stringi~1.4.3, tibble~2.1.1,
##     tidyr~0.8.3, tidyselect~0.2.5, tools~3.5.0, withr~2.1.2,
##     xfun~0.6, XML~3.98-1.19, xtable~1.8-4, yaml~2.2.0
## \end{itemize}
#Organize of files
#library(filesstrings)

#raw, expr normalized and metadata 
dir.create("merged_expr_metadata")
file.move(list.files(pattern = '*rawdata_transformation_FewSamples.pdf'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*rawdata_transformation.pdf'), "merged_expr_metadata")
## 7 files moved. 0 failed.
file.move(list.files(pattern = '*SampleID.csv'), "merged_expr_metadata")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*SampleID_coded.csv'), "merged_expr_metadata")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'preSVA_Merging.RData'), "merged_expr_metadata")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'done_pre_Step8.RData'), "merged_expr_metadata")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'GPL*'), "merged_expr_metadata")
## 6 files moved. 0 failed.
file.move(list.files(pattern = '*data_non-log'), "merged_expr_metadata")
## 7 files moved. 0 failed.
file.move(list.files(pattern = '*data_originalfromGEO'), "merged_expr_metadata")
## 7 files moved. 0 failed.
file.move(list.files(pattern = '*Annotation_Expr_GeneHu.csv'), "merged_expr_metadata")
## 7 files moved. 0 failed.
file.move(list.files(pattern = '*metadata.csv'), "merged_expr_metadata")
## 7 files moved. 0 failed.
file.move(list.files(pattern = '*SampleID_coded.csv'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = 'preSVA*'), "merged_expr_metadata")
## 7 files moved. 0 failed.
#Aggregate expression by variable  and plots
dir.create("Aggregate_csv_plots")
file.move(list.files(pattern = '*aggregate.csv'), "Aggregate_csv_plots")
## 26 files moved. 0 failed.
file.move(list.files(pattern = '*aggregate.pdf'), "Aggregate_csv_plots")
## 4 files moved. 0 failed.
file.move(list.files(pattern = 'edata_aggregate_data.RData'), "Aggregate_csv_plots")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'edata_lmY_aggregate_data.RData'), "Aggregate_csv_plots")
## 1 file moved. 0 failed.
#SVA_lmY results and plots
dir.create("SVA_lmY")
file.move(list.files(pattern = '*edata_lmY.csv'), "SVA_lmY")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*edata.csv'), "SVA_lmY")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'Step10_plot*'), "SVA_lmY")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'edata_lmY_data.RData'), "SVA_lmY")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'SVA_lmY_temp.RData'), "SVA_lmY")
## 1 file moved. 0 failed.
#remove extra files
file.remove(list.files(pattern ='VennDiagram*'))
## logical(0)
file.remove(list.files(pattern ='temp.RData'))
## [1] TRUE TRUE
#Remove .RData and clear environment to free up memory
rm(list=ls())
gc()
##           used  (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells 3893143 208.0   16728036  893.4         NA   20910045  1116.8
## Vcells 8604933  65.7 1084409974 8273.4     102400 1355694339 10343.2